library(igraph)
library(sqldf)
library(plyr)

source('https://raw.githubusercontent.com/fraba/R_cheatsheet/master/database.R')
source('https://raw.githubusercontent.com/fraba/R_cheatsheet/master/network.R')


# M5S

# Load the saved M5S forum objects. The statements below read `comment`,
# `thread`, `author_omega` and `g_m5s_direct_reply`; presumably each file
# provides the object its name suggests -- TODO confirm against the files.
load("02_15_m5s_forum_apr15_comment.RData")
load("02_15_m5s_forum_apr15_thread.RData")
load("02_15_m5s_forum_apr15_author_omega.RData")
load("02_23_m5s_forum_direct_reply.RData")

## User -> Thread

# Author vertices: one row per row of `author_omega`, flagged type = FALSE.
nodes_author <- subset(
  author_omega,
  select = c("id", "name", "gender", "member", "timestamp")
)
nodes_author$type <- FALSE
# Columns that only exist for threads are added as NA placeholders so the
# author and thread vertex tables end up with the same set of column names.
nodes_author[c("reactions", "dislikes", "createdAt", "slug", "postNumber",
               "likes", "message", "score", "categoryLink")] <- NA
# For authors the `url` column simply repeats the author id.
nodes_author$url <- nodes_author$id

# Create nodes dataframe for threads: one row per row of `thread`, flagged
# type = TRUE, with the thread link as vertex id and the title as name.
nodes_thread <- subset(thread,
                       select = c("link", "title", "reactions", "dislikes",
                                  "createdAt", "slug", "postNumber", "likes",
                                  "message", "score", "categoryLink",
                                  "timestamp"))
# rename() is namespace-qualified on purpose: the c(old = new) interface used
# here is plyr's. dplyr is attached further down this script and masks
# rename() with an incompatible function, so an unqualified call breaks as
# soon as the script is re-run in the same session.
nodes_thread <- plyr::rename(nodes_thread, c("link" = "id", "title" = "name"))
nodes_thread$type <- TRUE
# Author-only columns are added as NA placeholders so that rbind() below finds
# the same column names in both vertex tables.
nodes_thread$gender <- NA
nodes_thread$member <- NA
nodes_thread$url <- thread$link

# Merge two node sets (rbind on data frames matches columns by name)
nodes <- rbind(nodes_author, nodes_thread)
# rm(nodes_author);rm(nodes_thread)

# Create edges dataframe for proposals: one edge per thread, going from the
# thread's author to the thread itself (type = TRUE).
edges_proposal <- thread[, c("authorUrl", "link", "createdAt", "title")]
# plyr::rename is namespace-qualified because dplyr, attached later in this
# script, masks rename() with a function that takes different arguments.
edges_proposal <- plyr::rename(edges_proposal,
                               c("authorUrl" = "from", "link" = "to"))
edges_proposal$type <- TRUE
# Comment-only attributes are NA placeholders so both edge tables share the
# same column names.
edges_proposal$parent <- NA
edges_proposal$dislikes <- NA
edges_proposal$message <- NA
edges_proposal$numReports <- NA
edges_proposal$likes <- NA
edges_proposal$url <- thread$link
edges_proposal$textId <- thread$threadId

# Create edges dataframe for comments: one edge per comment, going from the
# comment's author to the thread it belongs to (type = FALSE). The inner JOIN
# keeps only comments whose threadId is present in `thread`.
edges_comment <- sqldf('SELECT 
                       comment.authorUrl AS source, 
                       thread.link AS target, 
                       comment.createdAt AS createdAt, 
                       comment.parent AS parent, 
                       comment.dislikes AS dislikes,
                       comment.message AS message,
                       comment.numReports AS numReports,
                       comment.likes AS likes,
                       comment.commentId AS textId,
                       thread.link AS url
                       FROM comment JOIN thread ON comment.threadId=thread.threadId')
edges_comment <- plyr::rename(edges_comment,
                              c("source" = "from", "target" = "to"))
edges_comment$type <- FALSE
edges_comment$title <- NA

# Merge two edge sets (rbind on data frames matches columns by name, so the
# differing column order of the two tables does not matter)
edges <- rbind(edges_proposal, edges_comment)
# rm(edges_proposal); rm(edges_comment)

# Diagnostic counters (duplicated_nodes, nodes_no_id, edges_no_from,
# edges_no_to, edges_no_match) are computed for inspection only; nothing in
# this script reads them afterwards.
duplicated_nodes <- sum(duplicated(nodes$id))
# nodes <- nodes[!(duplicated(nodes$id)),]

# Remove nodes with no ID. NA ids are treated like empty ones: a plain
# `id != ""` filter would turn every NA id into an all-NA row and make the
# counter NA.
nodes_no_id <- sum(is.na(nodes$id) | nodes$id == "")
nodes <- nodes[!is.na(nodes$id) & nodes$id != "", ]

# Remove empty edges (same NA-safe test as for the nodes)
edges_no_from <- sum(is.na(edges$from) | edges$from == "")
edges <- edges[!is.na(edges$from) & edges$from != "", ]
edges_no_to <- sum(is.na(edges$to) | edges$to == "")
# edges <- edges[edges$to!="",]
# Edges whose source is not among the vertex ids (counted, not removed)
edges_no_match <- sum(!(edges$from %in% nodes$id))
# edges <- edges[-which(is.na(match(edges$from,nodes$id))),]

# Keep only the calendar date of each post; the time of day is discarded.
edges$createdAt <- as.Date(edges$createdAt, format = "%Y-%m-%dT%H:%M:%S")

# library() instead of require(): a missing package now stops the script here
# rather than failing later at the first `%>%`.
library(dplyr)

# First and last posting date of every M5S author (`from` is the author id of
# each proposal/comment edge).
m5s_first_last_post <-
  edges %>%
  dplyr::group_by(from) %>%
  dplyr::summarize(first = min(createdAt),
                   last = max(createdAt))
# Days between an author's first and last post (0 = active on a single day).
m5s_first_last_post$diff <-
  as.numeric(m5s_first_last_post$last - m5s_first_last_post$first)
# Number of authors per day count. `Var1` holds the day counts as a factor,
# so convert it with as.numeric(as.character(Var1)) before using it as a
# number.
m5s_first_last_post_tbl <-
  as.data.frame(table(m5s_first_last_post$diff))

# Directed two-mode graph of the M5S forum: author -> thread edges, with the
# remaining columns of `edges`/`nodes` attached as edge/vertex attributes.
# graph_from_data_frame() is the current name of the deprecated
# graph.data.frame().
# NOTE(review): duplicated vertex ids and edges without a matching vertex are
# only counted earlier in the script, not removed -- confirm both counts are
# zero, otherwise graph construction is expected to fail.
g_m5s_bi <- graph_from_data_frame(edges, directed = TRUE, vertices = nodes)
# g_m5s_proj_user <- bipartite.projection(g_m5s_bi, which = 'true')
# g_m5s_proj_thread <- bipartite.projection(g_m5s_bi, which = 'false')

# Piraten
# Presumably provides `g_pp_direct_reply`, which is used further down --
# TODO confirm against the file.
load("02_24_pp_direct_reply_graph.RData")
# g_pp_direct_reply <- upgrade_graph(directreply_graph)
# rm(directreply_graph)
# save(g_pp_direct_reply, file = "/mnt/rstudio_wd/thesis/pp_direct_reply_graph.RData")

# One edge per forum post: author -> thread, dated by the post's calendar day.
# Note that this overwrites the M5S `edges` and `nodes` objects.
post <- sqliteGetTable('forum_piraten.sqlite', "post")
edges <- post[, c("authorId", "threadId", "postDatetime")]
edges$postDatetime <- as.Date(edges$postDatetime)

# Vertex table: every author id (type = FALSE) followed by every thread id
# (type = TRUE); repeated rows are collapsed by unique() below.
# NOTE(review): author and thread ids share one `id` column. If an author id
# can equal a thread id, unique() keeps both rows (they differ in `type`) and
# the vertex names are no longer unique -- confirm the id spaces are disjoint.
nodes <- data.frame(id = c(post$authorId, post$threadId),
                    type = c(rep(FALSE, nrow(post)), rep(TRUE, nrow(post))),
                    stringsAsFactors = FALSE)
# graph_from_data_frame() is the current name of the deprecated
# graph.data.frame().
g_piraten_bi <- graph_from_data_frame(edges, directed = TRUE,
                                      vertices = unique(nodes))

# Earliest and latest posting date of every Piraten author.
piraten_first_last_post <- dplyr::summarize(
  dplyr::group_by(edges, authorId),
  first = min(postDatetime),
  last = max(postDatetime)
)
# Days elapsed between an author's first and last post.
piraten_first_last_post$diff <- as.numeric(
  with(piraten_first_last_post, last - first)
)
# Frequency table: how many authors share each day count.
piraten_first_last_post_tbl <- as.data.frame(
  table(piraten_first_last_post[["diff"]])
)

# Side-by-side table of the two "days between first and last post"
# distributions. The full outer merge keeps day counts that occur in only one
# forum; their missing frequency is set to 0 below.
diff_tbl <-
  merge(m5s_first_last_post_tbl,
        piraten_first_last_post_tbl, by = "Var1", all = TRUE)
names(diff_tbl) <- c("Permanence", "M5S", "PP")
diff_tbl$M5S[is.na(diff_tbl$M5S)] <- 0
diff_tbl$PP[is.na(diff_tbl$PP)] <- 0

# library() instead of require(): a missing reshape2 now fails here rather
# than at the first melt() call.
library(reshape2)
# Long format for ggplot: one row per (Permanence, forum) pair, with the forum
# in `variable` and the number of authors in `value`.
diff_tbl_melted <- melt(diff_tbl, id.vars = "Permanence")

# Degree distribution
# (Re-)load the network helpers that define addDegreeToVertices() and
# vertexAttributesAsDataFrame().
source('https://raw.githubusercontent.com/fraba/R_cheatsheet/master/network.R')

# M5S two-mode graph: add the degree attributes, then export the vertex
# attributes as a data frame (the `indegree`/`outdegree` columns are read from
# it below).
g_m5s_bi <- addDegreeToVertices(g_m5s_bi)
g_m5s_bi_df <- vertexAttributesAsDataFrame(g_m5s_bi)

# Same for the Piraten two-mode graph; the result is stored under a new name
# (g_pp_bi), leaving g_piraten_bi itself unchanged.
g_pp_bi <- addDegreeToVertices(g_piraten_bi)
g_pp_bi_df <- vertexAttributesAsDataFrame(g_pp_bi)

# Project
# g_m5s_bi_proj_user <- bipartite.projection(g_m5s_bi, which = 'false')
# g_m5s_bi_proj_thread <- bipartite.projection(g_m5s_bi, which = 'true')
# 
# g_pp_bi_proj_user <- bipartite.projection(g_pp_bi, which = 'false')
# g_pp_bi_proj_thread <- bipartite.projection(g_pp_bi, which = 'true')


## Plot

# Build the degree-frequency table that compares the two forums.
#
# m5s_degree, pp_degree: degree values of the vertices to tabulate.
# degree_name: name given to the key column ("outdegree" or "indegree").
#
# Returns a data frame with the degree (as character) in `degree_name` and the
# number of vertices having that degree in `M5S` and `PP`. A degree observed
# in only one forum gets frequency 0 in the other; the row for degree "0" is
# dropped.
buildDegreeTable <- function(m5s_degree, pp_degree, degree_name) {
  m5s_freq <- as.data.frame(table(m5s_degree), stringsAsFactors = FALSE)
  names(m5s_freq) <- c("Var1", "M5S")
  pp_freq <- as.data.frame(table(pp_degree), stringsAsFactors = FALSE)
  names(pp_freq) <- c("Var1", "PP")

  # Full outer join so that degrees present in a single forum are kept.
  degree_tbl <- merge(m5s_freq, pp_freq, by = "Var1", all = TRUE)
  names(degree_tbl)[1] <- degree_name
  degree_tbl$M5S[is.na(degree_tbl$M5S)] <- 0
  degree_tbl$PP[is.na(degree_tbl$PP)] <- 0
  degree_tbl[degree_tbl[[degree_name]] != "0", ]
}

# Out-degree of the user vertices (type == FALSE).
outdegree_tbl <- buildDegreeTable(
  subset(g_m5s_bi_df, type == FALSE)$outdegree,
  subset(g_pp_bi_df, type == FALSE)$outdegree,
  "outdegree"
)
outdegree_tbl_melted <- melt(outdegree_tbl, id.vars = c("outdegree"))

# In-degree of the thread vertices (type == TRUE).
indegree_tbl <- buildDegreeTable(
  subset(g_m5s_bi_df, type == TRUE)$indegree,
  subset(g_pp_bi_df, type == TRUE)$indegree,
  "indegree"
)
indegree_tbl_melted <- melt(indegree_tbl, id.vars = c("indegree"))

# Keep the two-mode (bipartite) degree tables under their own names before
# outdegree_tbl_melted / indegree_tbl_melted are overwritten with the
# direct-reply tables below.
bi_outdegree_tbl_melted <- outdegree_tbl_melted
bi_indegree_tbl_melted <- indegree_tbl_melted

# Directed
# The direct-reply tables are built BEFORE the plots: the second plot reads
# direct_reply_outdegree_tbl_melted, which did not exist yet when the plots
# came first.
source('https://raw.githubusercontent.com/fraba/R_cheatsheet/master/network.R')
g_m5s_direct_reply <- addDegreeToVertices(g_m5s_direct_reply)
g_pp_direct_reply <- addDegreeToVertices(g_pp_direct_reply)

g_m5s_direct_reply_df <- vertexAttributesAsDataFrame(g_m5s_direct_reply)
g_pp_direct_reply_df <- vertexAttributesAsDataFrame(g_pp_direct_reply)

# Out-degree frequencies of both direct-reply graphs. The full outer merge
# keeps degrees seen in only one forum (frequency set to 0 in the other);
# the row for degree "0" is dropped.
outdegree_tbl <- merge(
  as.data.frame(table(g_m5s_direct_reply_df$outdegree),
                stringsAsFactors = FALSE),
  as.data.frame(table(g_pp_direct_reply_df$outdegree),
                stringsAsFactors = FALSE),
  by = "Var1", all = TRUE
)
names(outdegree_tbl) <- c("outdegree", "M5S", "PP")
outdegree_tbl$M5S[is.na(outdegree_tbl$M5S)] <- 0
outdegree_tbl$PP[is.na(outdegree_tbl$PP)] <- 0
outdegree_tbl <- outdegree_tbl[outdegree_tbl$outdegree != "0", ]
outdegree_tbl_melted <- melt(outdegree_tbl, id.vars = c("outdegree"))

# In-degree frequencies, built the same way.
indegree_tbl <- merge(
  as.data.frame(table(g_m5s_direct_reply_df$indegree),
                stringsAsFactors = FALSE),
  as.data.frame(table(g_pp_direct_reply_df$indegree),
                stringsAsFactors = FALSE),
  by = "Var1", all = TRUE
)
names(indegree_tbl) <- c("indegree", "M5S", "PP")
indegree_tbl$M5S[is.na(indegree_tbl$M5S)] <- 0
indegree_tbl$PP[is.na(indegree_tbl$PP)] <- 0
indegree_tbl <- indegree_tbl[indegree_tbl$indegree != "0", ]
indegree_tbl_melted <- melt(indegree_tbl, id.vars = c("indegree"))

direct_reply_outdegree_tbl_melted <- outdegree_tbl_melted
direct_reply_indegree_tbl_melted <- indegree_tbl_melted

# plots
library(ggplot2)

# Log-log axes, tick marks and theme shared by every plot below.
log_log_layers <- list(
  scale_x_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ),
  scale_y_log10(
    breaks = scales::trans_breaks("log10", function(x) 10^x),
    labels = scales::trans_format("log10", scales::math_format(10^.x))
  ),
  annotation_logticks(),
  theme_bw()
)

## Degrees
# User out-degree in the two-mode graphs.
ggplot(bi_outdegree_tbl_melted,
       aes(x = as.numeric(outdegree), y = value, colour = variable)) +
  geom_point() +
  log_log_layers

# Out-degree in the direct-reply graphs.
ggplot(direct_reply_outdegree_tbl_melted,
       aes(x = as.numeric(outdegree), y = value, colour = variable)) +
  geom_point(shape = 1) +
  log_log_layers

# Time from first to last post
# Var1 / Permanence come from as.data.frame(table(...)) and are factors:
# as.numeric() on a factor returns the level codes, not the day counts, so the
# values are converted through as.character() first. The "+ 1" keeps authors
# with 0 days between first and last post on the log axis.
ggplot(m5s_first_last_post_tbl,
       aes(x = as.numeric(as.character(Var1)) + 1, y = Freq)) +
  geom_line() +
  log_log_layers

ggplot(piraten_first_last_post_tbl,
       aes(x = as.numeric(as.character(Var1)) + 1, y = Freq)) +
  geom_line() +
  log_log_layers

# Both forums together; same "+ 1" offset as the two single-forum plots.
ggplot(diff_tbl_melted,
       aes(x = as.numeric(as.character(Permanence)) + 1, y = value,
           colour = variable)) +
  geom_point(shape = 1) +
  log_log_layers

# Persist the data frames behind the plots so the figures can be redrawn
# without re-running the analysis.
save(
  list = c(
    "bi_outdegree_tbl_melted", "bi_indegree_tbl_melted",
    "m5s_first_last_post_tbl", "piraten_first_last_post_tbl",
    "direct_reply_outdegree_tbl_melted", "direct_reply_indegree_tbl_melted",
    "diff_tbl_melted"
  ),
  file = "01_m5s_pp_fora_nets_df_for_ggplot.RData"
)

# Powerlaw checks

# Bootstrap test of the power-law hypothesis for a vector of discrete values.
#
# data: values passed unchanged to poweRlaw's displ$new().
#
# Returns the `p` element of the bootstrap_p() result. The bootstrap is run
# with its default settings and no seed is set here, so repeated calls can
# return slightly different values.
powerLawTest <- function(data) {
  # library() rather than require(): a missing package stops here with a clear
  # message instead of failing at `displ` with "object not found".
  library(poweRlaw)
  data_pl <- displ$new(data)
  bs <- bootstrap_p(data_pl)
  bs$p
}
# At this point outdegree_tbl is the direct-reply table (it was overwritten
# after the two-mode one) and its M5S column holds frequencies, not degrees.
# NOTE(review): this tests the frequency counts (+1) rather than the raw
# degree sequence, and the result is only printed, never stored -- confirm
# that is intended.
powerLawTest(outdegree_tbl$M5S+1)

# Summary statistics of one graph.
#
# g: an igraph graph.
#
# Returns a list with: nodes and edges (counts), components (number of
# connected components), perc_in_largest_component (share of vertices in the
# largest component), users_per_thread (vertices with type == FALSE divided by
# vertices with type == TRUE), mean_distance and transitivity.
basicNetStatistics <- function(g) {
  list(
    nodes = vcount(g),
    edges = ecount(g),
    components = count_components(g),
    perc_in_largest_component = max(components(g)$csize) / vcount(g),
    users_per_thread = sum(V(g)$type == FALSE) / sum(V(g)$type == TRUE),
    mean_distance = mean_distance(g),
    transitivity = transitivity(g)
  )
}

# Same statistics for a direct-reply graph, plus the correlation between the
# `indegree` and `outdegree` vertex attributes (these must already have been
# added to the graph's vertices).
# NOTE(review): users_per_thread relies on a `type` vertex attribute; if the
# direct-reply graphs do not carry one the ratio is 0/0 = NaN -- confirm.
directReplyStatistics <- function(g) {
  stats <- basicNetStatistics(g)
  stats$cor_in_outdegree <- cor(V(g)$indegree, V(g)$outdegree)
  stats
}

# Statistics per forum (m5s, pp) and per network (bi = two-mode user/thread
# graph, direct_reply = direct-reply graph).
#
# Statistics for the one-mode projections are disabled, like the projections
# themselves; with the helper above they would read, e.g.:
# net_statistics$m5s$bi_proj_user <- basicNetStatistics(g_m5s_bi_proj_user)
# net_statistics$m5s$bi_proj_thread <- basicNetStatistics(g_m5s_bi_proj_thread)
# net_statistics$pp$bi_proj_user <- basicNetStatistics(g_pp_bi_proj_user)
# net_statistics$pp$bi_proj_thread <- basicNetStatistics(g_pp_bi_proj_thread)
net_statistics <- list(
  "m5s" = list(
    bi = basicNetStatistics(g_m5s_bi),
    direct_reply = directReplyStatistics(g_m5s_direct_reply)
  ),
  "pp" = list(
    bi = basicNetStatistics(g_piraten_bi),
    direct_reply = directReplyStatistics(g_pp_direct_reply)
  )
)

save(net_statistics, file = '02_25_m5s_pp_fora_net_stats_list.RData')
